# !wget -O moviedataset.zip https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-ML0101EN-SkillsNetwork/labs/Module%205/data/moviedataset.zip --no-check-certificate
# print('unziping ...')
# !unzip moviedataset.zip

#Dataframe manipulation library
import pandas as pd
#Math functions, we'll only need the sqrt function so let's import only that
from math import sqrt
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

#Load both CSV files from the unzipped 'ml-latest' folder into pandas dataframes:
#movies_df holds the movie information, ratings_df holds the user ratings.
movies_df = pd.read_csv('ml-latest/movies.csv')
ratings_df = pd.read_csv('ml-latest/ratings.csv')

#Preview each dataframe. head() returns the first N rows; N defaults to 5.
movies_df.head()

ratings_df.head()

#Pull the release year out of each title with a regular expression.
#The pattern requires the four digits to be wrapped in parentheses so that
#digits belonging to the title itself are not mistaken for the year; the
#capture group keeps only the digits, so no second pass is needed to remove
#the parentheses. Raw strings (r'...') stop the regex backslashes from being
#read as invalid string escape sequences.
movies_df['year'] = movies_df['title'].str.extract(r'\((\d{4})\)', expand=False)
#Removing the parenthesised years from the 'title' column
movies_df['title'] = movies_df['title'].str.replace(r'\(\d{4}\)', '', regex=True)
#Trimming the whitespace left behind around the title. The .str accessor
#passes missing titles through as NaN instead of raising like a lambda would.
movies_df['title'] = movies_df['title'].str.strip()
movies_df.head()

#Every genre is separated by a | so splitting on it yields a list of genres per movie
movies_df['genres'] = movies_df['genres'].str.split('|')
movies_df.head()

#Build a one-hot genre table without looping over the rows in Python.
#explode() produces one row per (movie, genre) pair and repeats the movie's
#row label for each of its genres.
genre_per_row = movies_df['genres'].explode().dropna()
#Genre columns are kept in order of first appearance, which is the order a
#row-by-row fill would have created them in.
genre_order = genre_per_row.unique()
#Indicator columns per pair, collapsed back to one row per movie (max == "has
#this genre at least once"), stored as floats so the values are 1.0 / 0.0.
genre_flags = (
    pd.get_dummies(genre_per_row)
    .groupby(level=0)
    .max()
    .reindex(columns=genre_order)
    .astype(float)
)
#join() returns a new dataframe, so movies_df itself keeps its original columns.
moviesWithGenres_df = movies_df.join(genre_flags)
#Filling in the NaN values with 0 to show that a movie doesn't have that column's genre
moviesWithGenres_df = moviesWithGenres_df.fillna(0)
moviesWithGenres_df.head()

ratings_df.head()

#The timestamp column is removed from the ratings; drop() returns a new
#dataframe without the named column, which replaces ratings_df.
ratings_df = ratings_df.drop(columns='timestamp')
ratings_df.head()

#The movies the user has rated, one record per movie. Titles are written the
#way they appear in the 'title' column after the year has been stripped
#(leading articles moved to the end, e.g. 'Prestige, The').
userInput = [
    dict(title='Toy Story', rating=3.5),
    dict(title='Jumanji', rating=2),
    dict(title='Pulp Fiction', rating=5),
    dict(title='Breakfast Club, The', rating=2),
    dict(title='Prestige, The', rating=4.5),
    dict(title='Interstellar', rating=5),
    dict(title='Dark Knight, The', rating=5),
    dict(title='Akira', rating=2.5),
    dict(title='Batman Begins', rating=4.5),
    dict(title='Iron Man', rating=3.5),
    dict(title='Inception', rating=4.5),
]
#One dataframe row per record, with 'title' and 'rating' columns
inputMovies = pd.DataFrame(data=userInput)
inputMovies

#Keep only the rows of movies_df whose title is one of the titles the user rated
wanted_titles = inputMovies['title'].tolist()
inputId = movies_df.loc[movies_df['title'].isin(wanted_titles)]
#Merge the user's ratings onto those rows to pick up the movieId. No key is
#given, so pandas joins on the columns both frames share, which here is 'title'.
inputMovies = inputId.merge(inputMovies)
#The genres and year columns are not used from the input dataframe, so drop them
inputMovies = inputMovies.drop(columns=['genres', 'year'])
#Final input dataframe.
#If a movie you added above is missing here, it may not be in the original
#dataframe or it may be spelled differently; please check the capitalisation.
inputMovies

#Select the one-hot genre rows of the movies the user rated, matched by movieId
rated_ids = inputMovies['movieId'].tolist()
userMovies = moviesWithGenres_df.loc[moviesWithGenres_df['movieId'].isin(rated_ids)]
userMovies

#Renumber the rows from 0 so the row labels line up with those of inputMovies
userMovies = userMovies.reset_index(drop=True)
#Keep only the genre indicator columns by dropping every descriptive column
userGenreTable = userMovies.drop(columns=['movieId', 'title', 'genres', 'year'])
userGenreTable

inputMovies['rating']

0     3.5
1     2.0
2     5.0
3     2.5
4     2.0
5     4.5
6     4.5
7     5.0
8     3.5
9     4.5
10    5.0
Name: rating, dtype: float64

#Dot product of the transposed genre table with the ratings: each genre's
#weight is the sum of the ratings of the input movies that have that genre.
userProfile = userGenreTable.T.dot(inputMovies['rating'])
#The user profile
userProfile

Adventure             11.5
Animation              6.0
Children               5.5
Comedy                10.5
Fantasy                5.5
Romance                0.0
Drama                 21.0
Action                20.0
Crime                 19.0
Thriller              14.0
Horror                 0.0
Mystery                9.0
Sci-Fi                20.0
IMAX                  19.0
Documentary            0.0
War                    0.0
Musical                0.0
Western                0.0
Film-Noir              0.0
(no genres listed)     0.0
dtype: float64

#Genre indicators for every movie in the catalogue, indexed by movieId.
#set_index('movieId') moves the column into the index, so it no longer needs
#to be dropped separately.
genreTable = moviesWithGenres_df.set_index('movieId')
#Drop the remaining descriptive columns so that only genre columns are left
genreTable = genreTable.drop(columns=['title', 'genres', 'year'])
genreTable.head()

genreTable.shape

(34208, 20)

#Score every movie: multiply each genre column by the user's weight for that
#genre and add the products up per movie...
weighted_genre_scores = genreTable.mul(userProfile).sum(axis=1)
#...then divide by the total of all weights to get the weighted average
recommendationTable_df = weighted_genre_scores.div(userProfile.sum())
recommendationTable_df.head()

movieId
1    0.242236
2    0.139752
3    0.065217
4    0.195652
5    0.065217
dtype: float64

#Sort our recommendations in descending order of score, so the movieIds with
#the highest weighted average come first. The index (movieId) travels with
#each score.
recommendationTable_df = recommendationTable_df.sort_values(ascending=False)
#Just a peek at the values
recommendationTable_df.head()

movieId
79132     0.757764
5018      0.686335
26701     0.677019
115479    0.655280
81132     0.652174
dtype: float64

#The final recommendation table: the 20 best-scoring movies, best first.
#recommendationTable_df is sorted by score, so its first 20 index labels
#(movieIds) are the ranking. Left-merging them onto movies_df keeps that
#order; filtering movies_df with isin() would return the rows in movies_df
#order instead and lose the ranking.
top_movie_ids = pd.DataFrame({'movieId': recommendationTable_df.head(20).index})
top_movie_ids.merge(movies_df, on='movieId', how='left')

	movieId	title	genres
0	1	Toy Story (1995)	Adventure\|Animation\|Children\|Comedy\|Fantasy
1	2	Jumanji (1995)	Adventure\|Children\|Fantasy
2	3	Grumpier Old Men (1995)	Comedy\|Romance
3	4	Waiting to Exhale (1995)	Comedy\|Drama\|Romance
4	5	Father of the Bride Part II (1995)	Comedy

	userId	movieId	rating	timestamp
0	1	169	2.5	1204927694
1	1	2471	3.0	1204927438
2	1	48516	5.0	1204927435
3	2	2571	3.5	1436165433
4	2	109487	4.0	1436165496

	movieId	title	genres	year
0	1	Toy Story	Adventure\|Animation\|Children\|Comedy\|Fantasy	1995
1	2	Jumanji	Adventure\|Children\|Fantasy	1995
2	3	Grumpier Old Men	Comedy\|Romance	1995
3	4	Waiting to Exhale	Comedy\|Drama\|Romance	1995
4	5	Father of the Bride Part II	Comedy	1995

	movieId	title	genres	year
0	1	Toy Story	[Adventure, Animation, Children, Comedy, Fantasy]	1995
1	2	Jumanji	[Adventure, Children, Fantasy]	1995
2	3	Grumpier Old Men	[Comedy, Romance]	1995
3	4	Waiting to Exhale	[Comedy, Drama, Romance]	1995
4	5	Father of the Bride Part II	[Comedy]	1995

	userId	movieId	rating	timestamp
0	1	169	2.5	1204927694
1	1	2471	3.0	1204927438
2	1	48516	5.0	1204927435
3	2	2571	3.5	1436165433
4	2	109487	4.0	1436165496

Content-Based Filtering: My Approach

Author: Mohammad Sayem Chowdhury

Objectives

Table of contents

Acquiring the Data

Preprocessing

Content-Based recommendation system

Add movieId to input user

Advantages and Disadvantages of Content-Based Filtering

Advantages

Disadvantages

	title	rating
0	Toy Story	3.5
1	Jumanji	2.0
2	Pulp Fiction	5.0
3	Breakfast Club, The	2.0
4	Prestige, The	4.5
5	Interstellar	5.0
6	Dark Knight, The	5.0
7	Akira	2.5
8	Batman Begins	4.5
9	Iron Man	3.5
10	Inception	4.5

	Adventure	Animation	Children	Comedy	Fantasy	Drama	Action	Crime	Thriller	Mystery	Sci-Fi	IMAX
0	1.0	1.0	1.0	1.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
1	1.0	0.0	1.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
2	0.0	0.0	0.0	1.0	0.0	1.0	0.0	1.0	1.0	0.0	0.0	0.0
3	1.0	1.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	1.0	0.0
4	0.0	0.0	0.0	1.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0
5	0.0	0.0	0.0	0.0	0.0	0.0	1.0	1.0	0.0	0.0	0.0	1.0
6	0.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	1.0	1.0	1.0	0.0
7	0.0	0.0	0.0	0.0	0.0	1.0	1.0	1.0	0.0	0.0	0.0	1.0
8	1.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	1.0	0.0
9	0.0	0.0	0.0	0.0	0.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0
10	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	1.0

	movieId	title	genres	year
196	198	Strange Days	[Action, Crime, Drama, Mystery, Sci-Fi, Thriller]	1995
515	519	RoboCop 3	[Action, Crime, Drama, Sci-Fi, Thriller]	1993
2900	2985	RoboCop	[Action, Crime, Drama, Sci-Fi, Thriller]	1987
4923	5018	Motorama	[Adventure, Comedy, Crime, Drama, Fantasy, Mys...	1991
9000	26701	Patlabor: The Movie (Kidô keisatsu patorebâ: T...	[Action, Animation, Crime, Drama, Film-Noir, M...	1989
10427	37462	Wedlock	[Action, Crime, Drama, Sci-Fi, Thriller]	1991
12720	59844	Honor Among Thieves (Adieu l'ami) (Farewell, F...	[Action, Adventure, Crime, Drama, Mystery, Thr...	1968
12873	60684	Watchmen	[Action, Drama, Mystery, Sci-Fi, Thriller, IMAX]	2009
13250	64645	The Wrecking Crew	[Action, Adventure, Comedy, Crime, Drama, Thri...	1968
13552	67070	Army of One (Joshua Tree)	[Action, Adventure, Crime, Drama, Mystery, Thr...	1993
14397	71999	Aelita: The Queen of Mars (Aelita)	[Action, Adventure, Drama, Fantasy, Romance, S...	1924
15001	75408	Lupin III: Sweet Lost Night (Rupan Sansei: Swe...	[Action, Animation, Comedy, Crime, Drama, Myst...	2008
15073	76153	Lupin III: First Contact (Rupan Sansei: Faasut...	[Action, Animation, Comedy, Crime, Drama, Myst...	2002
15562	79132	Inception	[Action, Crime, Drama, Mystery, Sci-Fi, Thrill...	2010
16055	81132	Rubber	[Action, Adventure, Comedy, Crime, Drama, Film...	2010
16884	85261	Mars Needs Moms	[Action, Adventure, Animation, Children, Comed...	2011
17152	86644	Fast Five (Fast and the Furious 5, The)	[Action, Crime, Drama, Thriller, IMAX]	2011
21403	103651	Tai Chi Hero	[Action, Comedy, Drama, Fantasy, Sci-Fi, IMAX]	2012
24565	115479	Whip Hand, The	[Action, Adventure, Crime, Drama, Sci-Fi, Thri...	1951
26442	122787	The 39 Steps	[Action, Adventure, Comedy, Crime, Drama, Thri...	1959

	Adventure	Animation	Children	Comedy	Fantasy	Drama	Action	Crime	Thriller	Mystery	Sci-Fi	IMAX
0	1.0	1.0	1.0	1.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
1	1.0	0.0	1.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
2	0.0	0.0	0.0	1.0	0.0	1.0	0.0	1.0	1.0	0.0	0.0	0.0
3	1.0	1.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	1.0	0.0
4	0.0	0.0	0.0	1.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0
5	0.0	0.0	0.0	0.0	0.0	0.0	1.0	1.0	0.0	0.0	0.0	1.0
6	0.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	1.0	1.0	1.0	0.0
7	0.0	0.0	0.0	0.0	0.0	1.0	1.0	1.0	0.0	0.0	0.0	1.0
8	1.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	1.0	0.0
9	0.0	0.0	0.0	0.0	0.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0
10	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	1.0

	Adventure	Animation	Children	Comedy	Fantasy	Drama	Action	Crime	Thriller	Mystery	Sci-Fi	IMAX
0	1.0	1.0	1.0	1.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
1	1.0	0.0	1.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
2	0.0	0.0	0.0	1.0	0.0	1.0	0.0	1.0	1.0	0.0	0.0	0.0
3	1.0	1.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	1.0	0.0
4	0.0	0.0	0.0	1.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0
5	0.0	0.0	0.0	0.0	0.0	0.0	1.0	1.0	0.0	0.0	0.0	1.0
6	0.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	1.0	1.0	1.0	0.0
7	0.0	0.0	0.0	0.0	0.0	1.0	1.0	1.0	0.0	0.0	0.0	1.0
8	1.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	1.0	0.0
9	0.0	0.0	0.0	0.0	0.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0
10	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	1.0